
# Notebook helper: injects a jQuery snippet that hides all input (code) cells
# on load and renders a button that toggles them on/off. Display-only —
# touches no data; relies on the classic-notebook DOM ('div.input') and jQuery.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Agenda:
# Load the summary table (capped at 10,000 rows) and display it.
# FIX: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="skip" is the equivalent replacement (skip malformed rows).
summaryall = pd.read_csv("summaryall.csv", on_bad_lines="skip", nrows=10000)
summaryall
The data went through many preparation tasks before clustering and data reduction.
That included preparing the data:
- Data cleaning
- Distribution of features
- Identifying missing values in the dataset
- Handling missing data
- Visualisation
- Converted categorical and ordinal features into numeric features
- Created dummy features
- Transformed a categorical feature into a set of dummy features, each representing a unique category
- Transformed a categorical feature into numeric values
- Scaling and normalisation
This document covers:
- Correlation (scaled and non-scaled)
- Clustering of both datasets (scaled and non-scaled)
- Dropping highly correlated features
- PCA feature extraction on both (scaled and non-scaled)
- k-fold and the elbow method to select the optimal K
- Checking the optimal number of clusters
# before clustering
# List the columns of the two working frames (unscaled and scaled) prior
# to attaching any cluster labels.
# NOTE(review): IntNoDumKmeans is only assigned further down in this file
# (IntNoDumKmeans = IntNoDum) — this relies on notebook cell-execution
# order, not top-to-bottom script order; confirm before running as a script.
IntNoDumKmeans.columns
scaled_IntNoDumKmean.columns
# Correlation heatmap for the raw `data` frame.
fig, axis = plt.subplots(figsize=(8, 8))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, ax=axis, annot=True, fmt='.1f', linewidths=0.5)
plt.show()
# Correlation heatmap of scaled_IntNoDumKmean — taken BEFORE k-means runs,
# so no cluster-label column is present yet.
fig, axis = plt.subplots(figsize=(8, 8))
corr_matrix = scaled_IntNoDumKmean.corr()
sns.heatmap(corr_matrix, ax=axis, annot=True, fmt='.1f', linewidths=0.5)
plt.show()
# Correlation heatmap for the unscaled IntNoDum frame.
fig, axis = plt.subplots(figsize=(8, 8))
corr_matrix = IntNoDum.corr()
sns.heatmap(corr_matrix, ax=axis, annot=True, fmt='.1f', linewidths=0.5)
plt.show()
# Summary of the dummy-encoded frame, then its correlation heatmap.
dummiesNum.info()
fig, axis = plt.subplots(figsize=(8, 8))
corr_matrix = dummiesNum.corr()
sns.heatmap(corr_matrix, ax=axis, annot=True, fmt='.1f', linewidths=0.5)
plt.show()
# Scatter each candidate feature against Beat_Num (unscaled data),
# annotating each subplot with its Pearson and Spearman correlations.
plt.rcParams['figure.figsize'] = [16, 6]
fig, ax = plt.subplots(nrows=1, ncols=3)
ax = ax.flatten()
cols = ['Offence', 'Street', 'District']
colors = ['#415952', '#f35134', '#243AB5', '#243AB5']
# Hoist the correlation matrices out of the loop: .corr() scans the whole
# frame, so recomputing it in every iteration is wasted work.
pearson = IntNoDum.corr()
# FIX: the original referenced the undefined name `scaled_IntNoDum` here
# (it appears nowhere else in this file). Spearman is rank-based, hence
# invariant under linear scaling, so compute it on IntNoDum directly.
spearman = IntNoDum.corr(method='spearman')
for j, axis in enumerate(ax):
    if j == 0:
        # Only the leftmost subplot carries the shared y-axis label.
        axis.set_ylabel('Beat_Num')
    axis.scatter(IntNoDum[cols[j]], IntNoDum['Beat_Num'], alpha=0.5, color=colors[j])
    axis.set_xlabel(cols[j])
    axis.set_title('Pearson: %s' % pearson.loc[cols[j]]['Beat_Num'].round(2)
                   + ' Spearman: %s' % spearman.loc[cols[j]]['Beat_Num'].round(2))
plt.show()
def doKmeans(scaled_IntNoDumKmean, nclust=5):
    """Cluster the given frame with K-Means; return (labels, cluster centers)."""
    model = KMeans(nclust)
    # fit_predict is equivalent to fit(X) followed by predict(X).
    clust_labels = model.fit_predict(scaled_IntNoDumKmean)
    return (clust_labels, model.cluster_centers_)
# Run K-Means with k=2 on the scaled frame and attach the cluster labels.
clust_labels, cent = doKmeans(scaled_IntNoDumKmean, 2)
kmeans = pd.DataFrame(clust_labels)
# Append the labels as a new 'kmeans' column at the end of the frame.
# NOTE(review): re-running this raises ValueError (column already exists),
# and the label column will be included in any later correlation/PCA on
# this frame — confirm that is intended.
scaled_IntNoDumKmean.insert((scaled_IntNoDumKmean.shape[1]),'kmeans',kmeans)
# District vs Beat_Num, points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(scaled_IntNoDumKmean['District'],scaled_IntNoDumKmean['Beat_Num'],
c=kmeans[0],s=25)
ax.set_title('K-Means Clustering')
ax.set_xlabel('District per Beat_Num')
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
# Street vs Beat_Num (scaled frame), points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(scaled_IntNoDumKmean['Street'], scaled_IntNoDumKmean['Beat_Num'],
                     c=kmeans[0], s=25)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Street per Beat_Num')
# FIX: the y-axis holds Beat_Num (the original labelled it 'Street'),
# consistent with the District plot's labelling.
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
# Offence vs Beat_Num (scaled frame), points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(scaled_IntNoDumKmean['Offence'], scaled_IntNoDumKmean['Beat_Num'],
                     c=kmeans[0], s=25)
ax.set_title('K-Means Clustering')
ax.set_xlabel('Offence per Beat_Num')
# FIX: the y-axis holds Beat_Num (the original labelled it 'Offence').
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
def doKmeans(IntNoDum, nclust=5):
    """Fit a K-Means model on the frame; return its labels and centroids.

    NOTE: redefines the earlier doKmeans — the two are interchangeable.
    """
    estimator = KMeans(nclust)
    labels = estimator.fit_predict(IntNoDum)
    centers = estimator.cluster_centers_
    return (labels, centers)
# Run K-Means with k=2 on the UNSCALED frame and attach the cluster labels.
clust_labels, cent = doKmeans(IntNoDum, 2)
kmeans = pd.DataFrame(clust_labels)
# Append the labels as a new 'kmeans' column; re-running raises ValueError
# because the column already exists.
IntNoDum.insert((IntNoDum.shape[1]),'kmeans',kmeans)
# District vs Beat_Num, points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(IntNoDum['District'],IntNoDum['Beat_Num'],
c=kmeans[0],s=25)
ax.set_title('K-Means IntNoDum Clustering')
ax.set_xlabel('District per Beat_Num')
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
# Street vs Beat_Num (unscaled frame), points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(IntNoDum['Street'], IntNoDum['Beat_Num'],
                     c=kmeans[0], s=25)
ax.set_title('K-Means IntNoDum Clustering')
ax.set_xlabel('Street per Beat_Num')
# FIX: the y-axis holds Beat_Num (the original labelled it 'Street'),
# consistent with the District plot's labelling.
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
# Offence vs Beat_Num (unscaled frame), points coloured by cluster assignment.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(IntNoDum['Offence'], IntNoDum['Beat_Num'],
                     c=kmeans[0], s=25)
ax.set_title('K-Means IntNoDum Clustering')
ax.set_xlabel('Offence per Beat_Num')
# FIX: the y-axis holds Beat_Num (the original labelled it 'Offence').
ax.set_ylabel('Beat_Num')
plt.colorbar(scatter)
# NOTE(review): plain assignment — IntNoDumKmeans is an ALIAS of IntNoDum,
# not a copy; mutating either frame mutates both. Use .copy() if isolation
# is wanted.
IntNoDumKmeans = IntNoDum
IntNoDumKmeans.columns
# Peek at the selected feature columns plus the attached cluster label.
IntNoDumKmeans[["Street", "District", "Town", "Post_Code", "Offence", "MO_Desc", "Beat_Num", "kmeans"]].head()
IntNoDumKmeans[["Street", "District", "Town", "Post_Code", "Offence", "MO_Desc", "Beat_Num", "kmeans"]].tail()
from sklearn.decomposition import PCA
# PCA on the scaled frame to judge how many components are worth keeping.
# NOTE(review): if the 'kmeans' label column was inserted into this frame
# earlier, it is part of the PCA input here — confirm that is intended.
pca = PCA(n_components=19)
pca.fit(scaled_IntNoDumKmean)
print(pca.explained_variance_ratio_)
# Cumulative explained variance; the knee of this curve suggests the
# number of components to retain.
plt.figure(figsize=(15,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of IntNoDumKmeans components')
plt.ylabel('cumulative explained variance');
from sklearn.decomposition import PCA
# Same PCA scan on the unscaled frame. NOTE(review): IntNoDumKmeans is an
# alias of IntNoDum (assigned earlier without .copy()), so this duplicates
# the IntNoDum PCA below — presumably kept for side-by-side comparison.
pca = PCA(n_components=19)
pca.fit(IntNoDumKmeans)
print(pca.explained_variance_ratio_)
# Cumulative explained variance curve for choosing the component count.
plt.figure(figsize=(15,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of IntNoDumKmeans components')
plt.ylabel('cumulative explained variance');
# Display the unscaled frame (notebook rich display).
IntNoDum
#IntNoDum
from sklearn.decomposition import PCA
# PCA scan on the unscaled IntNoDum frame.
pca = PCA(n_components=19)
pca.fit(IntNoDum)
print(pca.explained_variance_ratio_)
# Cumulative explained variance curve for choosing the component count.
plt.figure(figsize=(15,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of IntNoDum components')
plt.ylabel('cumulative explained variance');
# Inspect the current `data` frame (bound earlier), then reload the raw
# crime CSV, capped at 10,000 rows. NOTE(review): head/columns run BEFORE
# the reload, so they show the previous contents of `data`.
data.head()
data.columns
# FIX: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="skip" is the equivalent replacement (skip malformed rows).
data = pd.read_csv("Crime_comma.csv", on_bad_lines="skip", nrows=10000)
# Drop the two string identifier columns so PCA receives numeric input only.
# NOTE(review): this rebinds `data` to scaled_IntNoDumKmean minus the ID
# columns — the CSV loaded just above is discarded; confirm intended.
data = scaled_IntNoDumKmean.drop(['Crime_Ref', 'Crime_Num'], axis=1)
## ERROR was: could not convert string to float: '99Y4/463197/19'
## so the string columns had to be dropped before PCA.
from sklearn.decomposition import PCA
# 18 components now that two columns were removed.
pca = PCA(n_components=18)
pca.fit(data)
print(pca.explained_variance_ratio_)
# Cumulative explained variance curve for choosing the component count.
plt.figure(figsize=(15,7))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of IntNoDum components')
plt.ylabel('cumulative explained variance');
Computes the Euclidean distance between two 1-D arrays.
The (weighted) Euclidean distance between 1-D arrays $u$ and $v$ is defined as
$$ {\|u-v\|}_2 = \left(\sum_i w_i \left|u_i - v_i\right|^2\right)^{1/2} $$
The Manhattan (city-block) distance between two 1-D arrays $u$ and $v$ is defined as
$$ \sum_i \left| u_i - v_i \right| $$
and is known in Python as dist.cityblock(AA, BB).
Parameters
u : (N,) array_like — input array.
v : (N,) array_like — input array.
w : (N,) array_like, optional —
the weights for each value in u and v. Default is None, which gives each value a weight of 1.0.
The Chebyshev — also called chessboard — distance is a distance metric defined as
$$ \max_i \left|u_i - v_i\right| $$
The cosine distance between 1-D arrays $u$ and $v$ is defined as
$$ 1 - \frac{u \cdot v}{\|u\|_2 \, \|v\|_2} $$
where $u \cdot v$ is the dot product of $u$ and $v$.
import scipy.spatial.distance as dist
# Pairwise distance between the District and Offence columns under several
# metrics, treating each column as one long 1-D vector.
dist.euclidean(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence)
print('\nEuclidean distance is =', dist.euclidean(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence))
print()
print('Manhattan distance is =', dist.cityblock(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence))
print()
print('Chebyshev distance is =', dist.chebyshev(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence))
print()
print('Canberra distance is =', dist.canberra(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence))
print()
print('Cosine distance is =', dist.cosine(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence))
#Chebyshev -- also chessboard -- distance is best defined as a distance metric
# Bare expressions below repeat three of the metrics so the notebook
# displays the raw values as cell output.
dist.cityblock(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence)
dist.chebyshev(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence)
dist.cosine(scaled_IntNoDumKmean.District, scaled_IntNoDumKmean.Offence)